In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2, venn3
import os, re, json
from utility_functions import *
# Display the date stamp made available by the star import above; the cells
# below interpolate it into the names of the saved figures.
DATE
Out[1]:
'20250630'
In [2]:
# --- Run configuration ---
# Root folder that contains one sub-folder per dataset listed in `PXDs`.
# The original machine-specific location remains the default, but it can be
# overridden with the RUN_IONBOT_DIR environment variable so the notebook
# can run elsewhere without editing this cell.
working_folder = os.environ.get(
    "RUN_IONBOT_DIR",
    "C:/Users/Enrico/OneDrive - UGent/run-ionbot",
)

# Dataset sub-folders; one joined CSV is read from each of them.
PXDs = [
    "PXD002057.v0.11.4",
    "PXD005833.v0.11.4",
    "PXD014258.v0.11.4",
]

# Filtering variant: it selects which input CSV is read and is embedded in
# the names of the saved figures. Alternatives are kept below for reference.
filtering = 'global'
# filtering = 'custom'
# filtering = 'hybrid'
In [3]:
# Read the joined search results of every dataset. The per-dataset frames
# live in their own list so that `data` only ever refers to the combined
# DataFrame (it is no longer a list first and a DataFrame afterwards).
frames = [
    pd.read_csv(
        os.path.join(
            working_folder,
            dataset_name,
            f"openprot-x-trembl-filt-{filtering}-outerjoin.csv",
        )
    )
    for dataset_name in PXDs
]

# Per-dataset shapes as a quick sanity check. A named loop variable is used
# instead of `_`, which IPython reserves for the last cell output.
for frame in frames:
    print(frame.shape)

# Stack all datasets into one table with a fresh 0..n-1 index.
data = pd.concat(frames, ignore_index=True)
print(data.shape)
data.tail()
(48597, 64) (164846, 64) (132750, 64) (346193, 64)
Out[3]:
| spectrum_title | scan | spectrum_file | precursor_mass_trembl | database_peptide_trembl | matched_peptide_trembl | modifications_trembl | database_trembl | psm_score_trembl | global_q_trembl | ... | all-explained_open | by-intensity-pattern-correlation_open | top_tag_rank_nterm_open | top_tag_rank_cterm_open | top_tag_rank_open | predicted_retention_time_open | retention_time_error_adjusted_open | Same_peptide | Same_mod_peptide | Same_mods_noRT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 346188 | ESC-HF-SampleHela5:controllerType=0 controller... | 9994 | ESC-HF-SampleHela5 | 1205.651464 | HLSVNDLPVGR | HLSVNDLPVGR | Unmodified | T | 2.325810 | 0.000057 | ... | 0.2528 | 0.7855 | 0.0 | 1.0 | 0.0 | 1023.251153 | 100.694167 | True | True | True |
| 346189 | ESC-HF-SampleHela5:controllerType=0 controller... | 9996 | ESC-HF-SampleHela5 | 1400.780981 | TFIAIKPDGVQR | TFIAIKPDGVQR | 6|[1263]Gly[K](144.07)_or_6|[4]Carbamidomethyl... | T | 0.764866 | 0.001069 | ... | 0.2335 | 0.7031 | 0.0 | 17.0 | 0.0 | 1274.538023 | 150.482963 | True | False | False |
| 346190 | ESC-HF-SampleHela5:controllerType=0 controller... | 9997 | ESC-HF-SampleHela5 | 1324.626854 | KFEEIPIAHIK | KFEEIPIAHIK | Unmodified | T | 1.441000 | 0.000057 | ... | 0.1061 | 0.8813 | 3.0 | 4.0 | 3.0 | 1105.713169 | 14.384891 | True | True | True |
| 346191 | ESC-HF-SampleHela5:controllerType=0 controller... | 9998 | ESC-HF-SampleHela5 | 1436.764379 | GVTFNVTTVDTKR | GVTFNVTTVDTKR | Unmodified | T | 1.822180 | 0.000057 | ... | 0.2252 | 0.6929 | 0.0 | 0.0 | 0.0 | 1159.870875 | 35.633775 | True | True | True |
| 346192 | ESC-HF-SampleHela5:controllerType=0 controller... | 9999 | ESC-HF-SampleHela5 | 1334.689550 | GNEIEPNFSATR | GNEIEPNFSATR | Unmodified | T | 1.931300 | 0.000057 | ... | 0.1597 | 0.7231 | 0.0 | 2.0 | 0.0 | 1285.778264 | 161.457404 | True | True | True |
5 rows × 64 columns
General plot¶
In [4]:
# Overall overlap between the two searches.
# A / B are the spectrum titles of the rows that carry a value in the
# respective `database_*` column; the following cell reuses both names.
A = data.loc[data.database_trembl.notna(), 'spectrum_title']
B = data.loc[data.database_open.notna(), 'spectrum_title']

venn2(
    [set(A), set(B)],
    set_labels=['TrEMBL', 'OpenProt'],
    set_colors=[project_palette['trembl'], project_palette['openprot']],
)
plt.title('Identified spectra overlap (all datasets)')

overlap_svg = f"publication-data/{DATE}-overall-overlap-trembl-openprot-{filtering}-filtering.svg"
plt.savefig(overlap_svg)
In [5]:
# Number of distinct spectrum titles in B (OpenProt side) relative to A (TrEMBL side).
n_titles_open = len(set(B))
n_titles_trembl = len(set(A))
n_titles_open / n_titles_trembl
Out[5]:
0.9581133226523886
In [6]:
# Build the Sankey figure for the full table. The helper also returns the
# underlying count table, which the next cells slice.
sankey_svg = f"publication-data/{DATE}-Sankey-trembl-openprot-{filtering}-filtering.svg"
F, counts = make_sankey_plot_with_counts(
    data,
    suffixes=['_trembl', '_open'],
)
F.write_image(sankey_svg)
F.show()
In [7]:
# Fix the order of the categories shown in the summary table.
# Rows are selected from the index of `counts`, columns from its columns.
row_categories = [
    'Canonical+Unmodified/Expected',
    'Canonical+Unexpected',
    'Decoy',
    'Unidentified',
]
column_categories = [
    'Canonical+Unmodified/Expected',
    'Canonical+Unexpected',
    'NonCanonical+Unmodified/Expected',
    'NonCanonical+Unexpected',
    'Decoy',
    'Unidentified',
]
data3 = counts.loc[row_categories, column_categories]
data3.style.background_gradient()
Out[7]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_trembl | ||||||
| Canonical+Unmodified/Expected | 226987 | 1207 | 1243 | 1382 | 534 | 13578 |
| Canonical+Unexpected | 1281 | 78567 | 735 | 1435 | 197 | 6910 |
| Decoy | 199 | 73 | 85 | 72 | 1012 | 2176 |
| Unidentified | 1796 | 1165 | 1837 | 2005 | 1717 | 0 |
In [8]:
# All spectra: the whole table, so the share printed below is 100% by construction.
tmp = data3.loc[:, :]
subset_total = tmp.sum().sum()
grand_total = data3.sum().sum()
print(subset_total)
print(f"{subset_total / grand_total:.1%}")
tmp
346193 100.0%
Out[8]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_trembl | ||||||
| Canonical+Unmodified/Expected | 226987 | 1207 | 1243 | 1382 | 534 | 13578 |
| Canonical+Unexpected | 1281 | 78567 | 735 | 1435 | 197 | 6910 |
| Decoy | 199 | 73 | 85 | 72 | 1012 | 2176 |
| Unidentified | 1796 | 1165 | 1837 | 2005 | 1717 | 0 |
In [9]:
# Canonical --> Canonical
# Label-based selection of the two canonical categories on both axes
# (the first two rows and first two columns of data3).
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
tmp = data3.loc[canonical, canonical]
subset_total = tmp.sum().sum()
grand_total = data3.sum().sum()
print(subset_total)
print(f"{subset_total / grand_total:.1%}")
tmp
308042 89.0%
Out[9]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Canonical+Unmodified/Expected | 226987 | 1207 |
| Canonical+Unexpected | 1281 | 78567 |
In [10]:
# Canonical --> Unidentified
# Canonical rows against the last column of data3 ('Unidentified'); the
# one-element list keeps the result a DataFrame.
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
tmp = data3.loc[canonical, ['Unidentified']]
subset_total = tmp.sum().sum()
grand_total = data3.sum().sum()
print(subset_total)
print(f"{subset_total / grand_total:.1%}")
tmp
20488 5.9%
Out[10]:
| sankey_open | Unidentified |
|---|---|
| sankey_trembl | |
| Canonical+Unmodified/Expected | 13578 |
| Canonical+Unexpected | 6910 |
In [11]:
# Canonical --> NonCanonical
# Canonical rows against the two NonCanonical columns (columns 2 and 3 of data3).
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
noncanonical = ['NonCanonical+Unmodified/Expected', 'NonCanonical+Unexpected']
tmp = data3.loc[canonical, noncanonical]
subset_total = tmp.sum().sum()
grand_total = data3.sum().sum()
print(subset_total)
print(f"{subset_total / grand_total:.1%}")
tmp
4795 1.4%
Out[11]:
| sankey_open | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Canonical+Unmodified/Expected | 1243 | 1382 |
| Canonical+Unexpected | 735 | 1435 |
In [12]:
# Any Peptide --> Any Peptide
# Everything except 'Unidentified', which is the last row and the last column of data3.
tmp = data3.drop(index='Unidentified', columns='Unidentified')
subset_total = tmp.sum().sum()
grand_total = data3.sum().sum()
print(subset_total)
print(f"{subset_total / grand_total:.1%}")
tmp
315009 91.0%
Out[12]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy |
|---|---|---|---|---|---|
| sankey_trembl | |||||
| Canonical+Unmodified/Expected | 226987 | 1207 | 1243 | 1382 | 534 |
| Canonical+Unexpected | 1281 | 78567 | 735 | 1435 | 197 |
| Decoy | 199 | 73 | 85 | 72 | 1012 |
In [13]:
# Unidentified --> Canonical
# 'Unidentified' is the row at position 3 of data3; the one-element list
# keeps the result a DataFrame.
canonical = ['Canonical+Unmodified/Expected', 'Canonical+Unexpected']
tmp = data3.loc[['Unidentified'], canonical]
subset_total = tmp.sum().sum()
grand_total = data3.sum().sum()
print(subset_total)
print(f"{subset_total / grand_total:.1%}")
tmp
2961 0.9%
Out[13]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Unidentified | 1796 | 1165 |
In [14]:
# Any --> NonCanonical
# All rows against the two NonCanonical columns (columns 2 and 3 of data3).
noncanonical = ['NonCanonical+Unmodified/Expected', 'NonCanonical+Unexpected']
tmp = data3.loc[:, noncanonical]
subset_total = tmp.sum().sum()
grand_total = data3.sum().sum()
print(subset_total)
print(f"{subset_total / grand_total:.1%}")
tmp
8794 2.5%
Out[14]:
| sankey_open | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Canonical+Unmodified/Expected | 1243 | 1382 |
| Canonical+Unexpected | 735 | 1435 |
| Decoy | 85 | 72 |
| Unidentified | 1837 | 2005 |
Zoom on non-canonical identifications¶
In [15]:
# Keep only the rows whose OpenProt-side match is flagged 'NonCanonical' and
# whose `database_open` value is 'T'.
is_noncanonical_open = data.isCanonical_open == 'NonCanonical'
is_db_open_T = data.database_open == 'T'
noncanonical_open = data[is_noncanonical_open & is_db_open_T]

# The second return value (the count table) is not used here. It is bound to
# a named throwaway rather than `_`, because assigning to `_` shadows
# IPython's "last output" variable for the rest of the session.
F, _zoom_counts = make_sankey_plot_with_counts(
    noncanonical_open,
    suffixes=['_trembl', '_open'],
)
F.write_image(f"publication-data/{DATE}-Zoomed-Sankey-trembl-openprot-{filtering}-filtering.svg")
F.show()
In [ ]:
# Call the project's `autosave` helper with the filtering mode appended as an
# extra label (presumably this snapshots the notebook; confirm against the
# helper's definition), then echo the mode that was used for this run.
label_suffix = '-' + filtering
autosave(extra_labels=label_suffix)
filtering